###########################################################
# File:     01-main-analysis.R
# Title:    La Raza
# Authors:  Muzquiz & Alcocer
# Date:     28-MAY-2025
# Purpose:  to analyze audit experiment data on
#           legislators' responsiveness to constituents
#           based on racial cues.
###########################################################

#====================
# SET UP ENVIRONMENT
#====================

# Clear all objects from the current workspace to ensure a clean environment
rm(list = ls())

# Set working directory to the folder containing replication materials
setwd('/Users/josealcocer/Library/CloudStorage/Dropbox/All School Files/USC PHD/Files/Non-Class Material/Research/La Raza/replication_files')

# Load required packages
library(readr)      # for reading CSV files
library(stargazer)  # for regression table output
library(texreg)     # alternative regression table formatting
library(pwr)        # for power analysis (if used later)
library(dplyr)      # for data manipulation
library(janitor)    # for cleaning and managing data frames
library(xtable)     # for LaTeX table output
library(sandwich)   # for robust standard errors
library(lmtest)     # for hypothesis testing
library(ggplot2)    # for plotting
library(broom)      # for tidying model outputs
library(car)        # for linear hypothesis testing

# Load the dataset
data <- read_csv("data.csv", show_col_types = FALSE)

#======================================
# COMPUTE DESCRIPTIVE STATISTICS TABLE
#======================================

# Summarise key variables by treatment group
# (0 = Indigenous, 1 = Mestiza, 2 = White European)
reply_proportions <- data %>%
  group_by(treatment) %>%
  summarise(
    `Total Legislators (N)` = as.integer(n()),
    `Coalitions` = NA,  # Placeholder for potential future variable
    `MORENA` = round(mean(opposition == 0), 2),
    `Opposition` = round(mean(opposition), 2),
    `Percentage of Indigenous Population` = NA,
    `Low` = round(mean(ind_pop == 1), 2),
    `Medium` = round(mean(ind_pop == 2), 2),
    `High` = round(mean(ind_pop == 3), 2),
    `Gender` = NA,  # Placeholder header row
    `Women` = round(mean(woman == 1), 2),
    `Men` = round(mean(woman == 0), 2)
  ) %>%
  mutate(
    # Convert numeric treatment values into readable labels
    Treatment = dplyr::recode(treatment,
      `0` = "Indigenous",
      `1` = "Mestiza",
      `2` = "White European"
    )
  )

# Transpose table for LaTeX output,
#  making rows = variable categories and columns = treatment groups
reply_proportions_df <- as.data.frame(t(reply_proportions))
reply_proportions_df <- row_to_names(reply_proportions_df, row_number = 1)

# Create a column for overall totals/statistics (across all treatments)
df2 <- data.frame("Total" = c(nrow(data),
                              NA,
                              mean(data$opposition == 0),
                              mean(data$opposition == 1),
                              NA,
                              mean(data$ind_pop == 1),
                              mean(data$ind_pop == 2),
                              mean(data$ind_pop == 3),
                              NA,
                              mean(data$woman == 1),
                              mean(data$woman == 0),
                              NA))

# Combine treatment and total columns into final table for publication
reply_proportions_df <- cbind(reply_proportions_df, df2)

# Generate and save LaTeX Table 1 Descriptive Statistics of Experimental Sample
writeLines(
  capture.output(xtable(reply_proportions_df)),
  "output/descriptive_statistics_table.tex"
)

#=======================
# CONDUCT BALANCE TESTS
#=======================

# Test for independence b/w sex and treatment groups w/ chi-squared test
gender_treatment <- table(data$woman, data$treatment)
gender_test <- chisq.test(table(data$woman, data$treatment))

# Test for independence b/w coalition affiliation & treatment
coalition_treatment <- table(data$opposition, data$treatment)
coalition_test <- chisq.test(table(data$opposition, data$treatment))

# Test for independence b/w indigenous population group & treatment
population_treatment <- table(data$ind_pop, data$treatment)
population_test <- chisq.test(table(data$ind_pop, data$treatment))

# Construct the results data frame
chi2_results <- data.frame(
  Covariate = c("Gender (Male vs. Female)",
                "Coalition (MORENA vs. Opposition)",
                "Indigenous Population (Low, Med, High)"),
  `Chi-Squared Statistic` = round(c(gender_test$statistic,
                                    coalition_test$statistic,
                                    population_test$statistic), 2),
  df = c(gender_test$parameter,
         coalition_test$parameter,
         population_test$parameter),
  `p-value` = round(c(gender_test$p.value,
                      coalition_test$p.value,
                      population_test$p.value), 2),
  check.names = FALSE
)

# Generate LaTeX Appendix Table D Balance Tests
#  for Key Covariates Across Treatment Groups
writeLines(
  capture.output(xtable(chi2_results,
    align = c("l", "l", "c", "c", "c")
  )),
  "output/balance_test_results.tex"
)

#==========================
# GET RESPONSE PROPORTIONS
#==========================

# Compute mean response rate (proportion of replies) within each treatment group
reply_proportions <- data %>%
  group_by(treatment) %>%
  summarise(
    Proportion_Reply = round(mean(response), 2),
    N = as.integer(n())  # number of observations per treatment
  )

# Prepare the table for publication by transposing and labeling treatment groups
reply_proportions_df <- as.data.frame(t(reply_proportions))
colnames(reply_proportions_df) <- c("Indigenous", "Mestiza", "White European")
reply_proportions_df <- reply_proportions_df[-1, ]

# Generate LaTeX Table 2 Response Rates Across Treatment Groups
writeLines(
  capture.output(xtable(reply_proportions_df)),
  "output/response_proportions_table.tex"
)

#========================
# CONDUCT POWER ANALYSIS
#========================

# Perform post-hoc power analysis to estimate ability
# to detect a small effect (Cohen's h = 0.20)
# This helps determine whether each sample size has
# sufficient power

# Define parameters
h_small <- 0.20
alpha <- 0.05

# Compute power
indig_power <- pwr.2p.test(h = h_small, n = 192, sig.level = alpha)
mestiza_power <- pwr.2p.test(h = h_small, n = 207, sig.level = alpha)

# Format table
power_results <- data.frame(
  `Treatment Group` = c("Indigenous", "Mestiza"),
  `Effect Size ($h$)` = c("0.20", "0.20"),
  `Sample Size ($n$)` = c("192", "207"),
  `Significance Level ($\\alpha$)` = c("0.05", "0.05"),
  `Power` = c("0.500", "0.530"),
  check.names = FALSE
)

# Print LaTeX table F. Power Analysis for Minimum Detectable Effects
writeLines(
  capture.output(xtable(power_results,
         caption = "Post Hoc Power Analysis for Small Effect Size (Cohen's $h = 0.2$)",
         label = "tab:power_analysis",
         align = c("l", "l", "c", "c", "c", "c"),
    include.rownames = FALSE,
    sanitize.colnames.function = identity,
    caption.placement = "top"
  )),
  "output/power_analysis.tex"
)

#=======================
# CONDUCT MAIN ANALYSIS
#=======================

# Mutate MORENA Party var and Recode Treatment var
data <- data %>%
  mutate(MORENA_Coalition = ifelse(opposition == 0, 1, 0),
         Treatment_Recoded = case_when(
           treatment == 0 ~ 1,
           treatment == 1 ~ 2,
           treatment == 2 ~ 0
         ),
         Treatment_Recoded = as.factor(Treatment_Recoded))

levels(data$Treatment_Recoded) <- c("Baseline", "Indigenous", "Mestiza")

# Response variable
model_1 <- lm(response ~ Treatment_Recoded, data = data)
robust_se_1 <- sqrt(diag(vcovHC(model_1, type = "HC")))

# Greeting variable
model_2 <- lm(greeting ~ Treatment_Recoded, data = data)
robust_se_2 <- sqrt(diag(vcovHC(model_2, type = "HC")))

# Warmth variable
model_3 <- lm(warmth ~ Treatment_Recoded, data = data)
robust_se_3 <- sqrt(diag(vcovHC(model_3, type = "HC")))

# Inquiry variable
model_4 <- lm(inquiry ~ Treatment_Recoded, data = data)
robust_se_4 <- sqrt(diag(vcovHC(model_4, type = "HC")))

# Generate LaTeX table Table 3 Main Analysis Results for All Dependent Variables
stargazer(model_1, model_2, model_3, model_4, type = "latex",
          title = "Main Analysis Results for All Dependent Variables",
          se = list(robust_se_1, robust_se_2, robust_se_3, robust_se_4),
          dep.var.labels = c("Response Received",
                             "Name Greeting",
                             "Warm Response",
                             "Resolved Inquiry"),
          covariate.labels = c("Treatment (Indigenous)",
                               "Treatment (Mestiza)"),
          omit.stat = c("f", "ser"),
          no.space = TRUE,
          out = "output/main_models_table.tex")

# Perform linear hypothesis test: Indigenous = Mestiza
# This tests whether the treatment effects for Indigenous and
# Mestiza are statistically different
# This is useful for understanding the relative impact of each treatment
# Results: No significant difference found
# reported in main text
linearHypothesis(model_2,
  "Treatment_RecodedIndigenous = Treatment_RecodedMestiza"
)

#==============================
# CONDUCT INTERACTION ANALYSES
#==============================

#-----------------------------------------
# COALITION AFFILIATION TREATMENT EFFECTS
#-----------------------------------------

# Response variable
model_5 <- lm(response ~ Treatment_Recoded * MORENA_Coalition, data = data)
robust_se_5 <- sqrt(diag(vcovHC(model_5, type = "HC")))

# Greeting variable
model_6 <- lm(greeting ~ Treatment_Recoded * MORENA_Coalition, data = data)
robust_se_6 <- sqrt(diag(vcovHC(model_6, type = "HC")))

# Warmth variable
model_7 <- lm(warmth ~ Treatment_Recoded * MORENA_Coalition, data = data)
robust_se_7 <- sqrt(diag(vcovHC(model_7, type = "HC")))

# Inquiry variable
model_8 <- lm(inquiry ~ Treatment_Recoded * MORENA_Coalition, data = data)
robust_se_8 <- sqrt(diag(vcovHC(model_8, type = "HC")))

# Generate LaTeX Table 4 Coalition-Conditional Results for All Dependent Variables
stargazer(model_5, model_6, model_7, model_8, type = "latex",
          title = "Coalition-Conditional Results for All Dependent Variables",
          se = list(robust_se_5, robust_se_6, robust_se_7, robust_se_8),
          dep.var.labels = c("Response Received",
                             "Name Greeting",
                             "Warm Response",
                             "Resolved Inquiry"),
          covariate.labels = c("Treatment (Indigenous)",
                               "Treatment (Mestiza)",
                               "Coalition (MORENA)",
                               "Treatment (Indigenous) x Coalition (MORENA)",
                               "Treatment (Mestiza) x Coalition (MORENA)"),
          omit.stat = c("f", "ser"),
          no.space = TRUE,
          out = "output/coalition_interaction_models_table.tex")

#-----------------------------------------
# INDIGENOUS POPULATION TREATMENT EFFECTS
#-----------------------------------------

# Response variable
model_9 <- lm(response ~ Treatment_Recoded * ind_pop, data = data)
robust_se_9 <- sqrt(diag(vcovHC(model_9, type = "HC")))

# Greeting variable
model_10 <- lm(greeting ~ Treatment_Recoded * ind_pop, data = data)
robust_se_10 <- sqrt(diag(vcovHC(model_10, type = "HC")))

# Warmth variable
model_11 <- lm(warmth ~ Treatment_Recoded * ind_pop, data = data)
robust_se_11 <- sqrt(diag(vcovHC(model_11, type = "HC")))

# Inquiry variable
model_12 <- lm(inquiry ~ Treatment_Recoded * ind_pop, data = data)
robust_se_12 <- sqrt(diag(vcovHC(model_12, type = "HC")))

# Generate LaTeX Table 5 Indigenous Proportion-Conditional Results for All Dependent Variables 
stargazer(model_9, model_10, model_11, model_12, type = "latex",
          title = "Indigenous Proportion-Conditional Results for All Dependent Variables",
          se = list(robust_se_9, robust_se_10, robust_se_11, robust_se_12),
          dep.var.labels = c("Response Received",
                             "Name Greeting",
                             "Warm Response",
                             "Resolved Inquiry"),
          covariate.labels = c("Treatment (Indigenous)",
                               "Treatment (Mestiza)",
                               "Indigenous Population",
                               "Treatment (Indigenous) x Indigenous Population",
                               "Treatment (Mestiza) x Indigenous Population"),
          omit.stat = c("f", "ser"),
          no.space = TRUE,
          out = "output/indigenous_interaction_models_table.tex")

#==========================================
# CONDUCT LEAVE-ONE-OUT ANALYSES (GENERAL)
#==========================================

# This function runs Leave-One-Out (LOO) analyses for the models
# and generates summary statistics and plots for each term.
# It takes a list of models, their labels, and other metadata.
loo_summary_by_dv <- function(models, model_labels, data,
                              terms, dv_name, alpha = 0.05,
                              response_flag = "response") {
  if (length(models) != length(model_labels)) {
    stop("The number of models and model labels must match.")
  }

  results_list <- list()

  # Loop over each model and label to compute LOO diagnostics
  for (j in seq_along(models)) {
    model <- models[[j]]
    label <- model_labels[[j]]
    model_data <- model.frame(model)
    n <- nrow(model_data)
    coef_names <- names(coef(model))
    vcov_fn <- vcovHC(model, type = "HC")
    se_full <- sqrt(diag(vcov_fn))
    full_coefs <- coef(model)
    full_pvals <- coeftest(model, vcov. = vcov_fn)[, 4]

    for (term in terms) {
      # Skip term if not in model
      if (!term %in% coef_names) next

      full_estimate <- full_coefs[term]
      full_pval <- full_pvals[term]
      full_sign <- sign(full_estimate)
      full_sig <- full_pval < alpha

      estimates <- numeric(n)
      pvalues <- numeric(n)
      signs <- character(n)
      sig_status <- logical(n)

      for (i in 1:n) {
        # Refit model excluding one observation
        loo_data <- model_data[-i, ]
        loo_model <- lm(formula(model), data = loo_data)
        loo_se <- sqrt(diag(vcovHC(loo_model, type = "HC")))
        loo_coef <- coef(loo_model)[term]
        loo_pval <- coeftest(loo_model,
                             vcov. = vcovHC(loo_model, type = "HC"))[term, 4]

        # Track estimate, p-value, sign flip, significance change
        estimates[i] <- loo_coef
        pvalues[i] <- loo_pval
        signs[i] <- ifelse(sign(loo_coef) == full_sign, "Same", "Flipped")
        sig_status[i] <- loo_pval < alpha
      }

      # Summarize LOO results for current term and model
      summary_row <- data.frame(
        DV = dv_name,
        Model = label,
        Term = term,
        Full_Estimate = full_estimate,
        Mean_Loo_Estimate = mean(estimates, na.rm = TRUE),
        SD_Loo_Estimate = sd(estimates, na.rm = TRUE),
        Min_Loo_Estimate = min(estimates, na.rm = TRUE),
        Max_Loo_Estimate = max(estimates, na.rm = TRUE),
        Sign_Flips = sum(signs == "Flipped", na.rm = TRUE),
        Sig_Changes = sum(sig_status != full_sig, na.rm = TRUE),
        Full_Model_Significant = full_sig
      )

      results_list[[length(results_list) + 1]] <- summary_row
    }
  }

  # Combine all model-term summaries into a single data frame
  final_df <- do.call(rbind, results_list)
  return(final_df)
}

#--------------
# LOO SUMMARY
#--------------

# Run LOO summary for each dependent variable

results_response <- loo_summary_by_dv(
  models = list(model_1, model_5, model_9),
  model_labels = c("Main", "Coalition Interaction", "Indig Pop Interaction"),
  data = data,
  terms = c("Treatment_RecodedIndigenous", "Treatment_RecodedMestiza"),
  dv_name = "response"
)

results_greeting <- loo_summary_by_dv(
  models = list(model_2, model_6, model_10),
  model_labels = c("Main", "Coalition Interaction", "Indig Pop Interaction"),
  data = data,
  terms = c("Treatment_RecodedIndigenous", "Treatment_RecodedMestiza"),
  dv_name = "greeting"
)

results_warmth <- loo_summary_by_dv(
  models = list(model_3, model_7, model_11),
  model_labels = c("Main", "Coalition Interaction", "Indig Pop Interaction"),
  data = data,
  terms = c("Treatment_RecodedIndigenous", "Treatment_RecodedMestiza"),
  dv_name = "warmth"
)

results_inquiry <- loo_summary_by_dv(
  models = list(model_4, model_8, model_12),
  model_labels = c("Main", "Coalition Interaction", "Indig Pop Interaction"),
  data = data,
  terms = c("Treatment_RecodedIndigenous", "Treatment_RecodedMestiza"),
  dv_name = "inquiry"
)

# Combine all LOO summaries into one final table
results_combined <- rbind(results_response, results_greeting,
                          results_warmth, results_inquiry)

# Export combined LOO results to CSV
write.csv(results_combined, "loo_results.csv", row.names = FALSE)